In [1]:
import os

os.listdir()
Out[1]:
['.ipynb_checkpoints',
 'train',
 'train_list.txt',
 'v1.ipynb',
 'val',
 'val_list.txt']
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [3]:
with open('train_list.txt') as f:
    contents = f.readlines()

lines_train = [line.strip() for line in contents] 
In [4]:
print('Number of label in training data:',len(lines_train))
Number of label in training data: 12686
In [5]:
from collections import defaultdict
In [6]:
%%time

d = defaultdict(list)

image_path = os.listdir('train/')

for i in range(len(image_path)):
    for j in range(len(lines_train)):
        #print(lines_train[j].split('lines')[1].split('.png')[0].split('/')[3])
        if image_path[i] == lines_train[j].split('lines')[1].split('.png')[0].split('/')[3] + '.png':
            d['image_path'].append('train/' + image_path[i])
            d['label'].append(lines_train[j].split('.png\t')[1])
            break     
    
train_df = pd.DataFrame(d)
Wall time: 1min 16s
In [7]:
train_df.head()
Out[7]:
image_path label
0 train/a01-000u-00.png A MOVE to stop Mr. Gaitskell from
1 train/a01-000u-01.png nominating any more Labour life Peers
2 train/a01-000u-02.png is to be made at a meeting of Labour
3 train/a01-000u-03.png M
4 train/a01-000u-04.png put down a resolution on the subject
In [8]:
train_df.shape
Out[8]:
(12686, 2)
In [9]:
import random
import cv2

def Rand(start, end, num):
    res = []
  
    for j in range(num):
        res.append(random.randint(start, end))
  
    return res
In [10]:
rand_num_list = Rand(0, 8000, 100)

for i in rand_num_list:
    img = cv2.imread(train_df['image_path'][i])
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    plt.figure(figsize=(20, 20))
    plt.subplot(1, 2, 1)
    plt.title(train_df['label'][i])
    plt.imshow(img)

    kernel_sharpening = np.array([[-1,-1,-1], 
                                  [-1,9,-1], 
                                  [-1,-1,-1]])


    sharpened = cv2.filter2D(img, -1, kernel_sharpening)


    plt.subplot(1, 2, 2)
    plt.title(train_df['label'][i])
    plt.imshow(sharpened)
    plt.show()
In [11]:
import string

max_label_len = 0

#char_list = "!\"#&@'()*+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
char_list = string.printable + string.ascii_letters + string.digits

print(char_list, len(char_list))

def encode_to_labels(txt):
    # encoding each output word into digits
    dig_lst = []
    for index, chara in enumerate(txt):
        dig_lst.append(char_list.index(chara))
        
    return dig_lst
0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	
abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 162
In [12]:
images = []
labels = []

RECORDS_COUNT = train_df.shape[0]

train_images = []
train_labels = []
train_input_length = []
train_label_length = []
train_original_text = []

valid_images = []
valid_labels = []
valid_input_length = []
valid_label_length = []
valid_original_text = []

inputs_length = []
labels_length = []
In [13]:
def process_image(img):
    """
    Converts image to shape (150, 1000, 1) & normalize
    """
    w, h = img.shape

    # Aspect Ratio Calculation
    new_w = 150
    new_h = int(h * (new_w / w))
    img = cv2.resize(img, (new_h, new_w))
    w, h = img.shape
    
    img = img.astype('float32')
    
    # Converts each to (32, 128, 1)
    if w < 150:
        add_zeros = np.full((150-w, h), 255)
        img = np.concatenate((img, add_zeros))
        w, h = img.shape
    
    if h < 1000:
        add_zeros = np.full((w, 128-h), 255)
        img = np.concatenate((img, add_zeros), axis=1)
        w, h = img.shape
        
    if h > 1000 or w > 150:
        dim = (1000,150)
        img = cv2.resize(img, dim)
    
    img = cv2.subtract(255, img)
    
    img = np.expand_dims(img, axis=2)
    
    # Normalize 
    img = img / 255
    
    return img
In [14]:
%%time

i = 0
for index, line in enumerate(lines_train):
    filepath = train_df.loc[i, 'image_path']
    word = train_df.loc[i, 'label']
    # process image
    img = cv2.imread(filepath, cv2.IMREAD_GRAYSCALE)
    try:
        img = process_image(img)
    except:
        i += 1
        continue
        
    # process label
    #print(train_df.loc[i, 'label'])
    label = encode_to_labels(word)

    if index % 8 == 0:
        valid_images.append(img)
        valid_labels.append(label)
        valid_input_length.append(150)
        valid_label_length.append(len(word))
        valid_original_text.append(word)
    else:
        train_images.append(img)
        train_labels.append(label)
        train_input_length.append(150)
        train_label_length.append(len(word))
        train_original_text.append(word)

    if len(word) > max_label_len:
        max_label_len = len(word)
    
    i += 1
    
    if i >= RECORDS_COUNT:
        break
Wall time: 35.6 s
In [15]:
len(train_images), len(valid_images), len(train_labels), len(valid_labels)
Out[15]:
(10735, 1544, 10735, 1544)
In [16]:
!pip install keras_tqdm
Requirement already satisfied: keras_tqdm in c:\users\dawaaii\anaconda3\lib\site-packages (2.0.1)
Requirement already satisfied: tqdm in c:\users\dawaaii\anaconda3\lib\site-packages (from keras_tqdm) (4.59.0)
Requirement already satisfied: Keras in c:\users\dawaaii\anaconda3\lib\site-packages (from keras_tqdm) (2.4.3)
Requirement already satisfied: pyyaml in c:\users\dawaaii\anaconda3\lib\site-packages (from Keras->keras_tqdm) (5.4.1)
Requirement already satisfied: scipy>=0.14 in c:\users\dawaaii\anaconda3\lib\site-packages (from Keras->keras_tqdm) (1.6.2)
Requirement already satisfied: numpy>=1.9.1 in c:\users\dawaaii\anaconda3\lib\site-packages (from Keras->keras_tqdm) (1.19.5)
Requirement already satisfied: h5py in c:\users\dawaaii\anaconda3\lib\site-packages (from Keras->keras_tqdm) (3.1.0)
In [17]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.layers import Dense, LSTM, Reshape, BatchNormalization, Input, Conv2D, MaxPool2D, Lambda, Bidirectional, Flatten, Activation, MaxPooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.activations import relu, sigmoid, softmax
import tensorflow.keras.backend as K
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint
#from keras_tqdm import TQDMNotebookCallback

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
In [18]:
import tensorflow as tf
from tensorflow.python.client import device_lib

tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Check all available devices if GPU is available
print(device_lib.list_local_devices())
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 2871999445918458268
]
Device mapping: no known devices.
In [19]:
tf.config.experimental.list_physical_devices('GPU')
Out[19]:
[]
In [20]:
%%time

train_padded_label = pad_sequences(train_labels, 
                             maxlen=max_label_len, 
                             padding='post',
                             value=len(char_list))

valid_padded_label = pad_sequences(valid_labels, 
                             maxlen=max_label_len, 
                             padding='post',
                             value=len(char_list))
Wall time: 87.7 ms
In [21]:
%%time

train_images = np.asarray(train_images)
train_input_length = np.asarray(train_input_length)
train_label_length = np.asarray(train_label_length)

valid_images = np.asarray(valid_images)
valid_input_length = np.asarray(valid_input_length)
valid_label_length = np.asarray(valid_label_length)
Wall time: 4.72 s
In [22]:
input_data = Input(shape=(150, 1000, 1), dtype='float32')
input_data
Out[22]:
<KerasTensor: shape=(None, 150, 1000, 1) dtype=float32 (created by layer 'input_1')>
In [23]:
inner = Conv2D(64, (3, 5), padding='same', name='conv1', kernel_initializer='he_normal')(input_data)
inner
Out[23]:
<KerasTensor: shape=(None, 150, 1000, 64) dtype=float32 (created by layer 'conv1')>
In [24]:
inner = BatchNormalization()(inner)
inner
Out[24]:
<KerasTensor: shape=(None, 150, 1000, 64) dtype=float32 (created by layer 'batch_normalization')>
In [25]:
inner = Activation('relu')(inner)
inner
Out[25]:
<KerasTensor: shape=(None, 150, 1000, 64) dtype=float32 (created by layer 'activation')>
In [26]:
inner = MaxPooling2D(pool_size=(1, 2), name='max1')(inner)  # (None,64, 32, 64)
inner
Out[26]:
<KerasTensor: shape=(None, 150, 500, 64) dtype=float32 (created by layer 'max1')>
In [27]:
inner = Conv2D(128, (3, 5), padding='same', name='conv2', kernel_initializer='he_normal')(inner)  # (None, 64, 32, 128)
inner = BatchNormalization()(inner)
inner = Activation('relu')(inner)
inner = MaxPooling2D(pool_size=(1, 2), name='max2')(inner)
In [28]:
inner
Out[28]:
<KerasTensor: shape=(None, 150, 250, 128) dtype=float32 (created by layer 'max2')>
In [29]:
inner = Conv2D(256, (3, 5), padding='same', name='conv3', kernel_initializer='he_normal')(inner)  # (None, 32, 16, 256)
inner = BatchNormalization()(inner)
inner = Activation('relu')(inner)
inner = Conv2D(256, (3, 5), padding='same', name='conv4', kernel_initializer='he_normal')(inner)  # (None, 32, 16, 256)
inner = BatchNormalization()(inner)
inner = Activation('relu')(inner)
inner = MaxPooling2D(pool_size=(1, 2), name='max3')(inner)
In [30]:
inner
Out[30]:
<KerasTensor: shape=(None, 150, 125, 256) dtype=float32 (created by layer 'max3')>
In [31]:
inner = Conv2D(512, (3, 5), padding='same', name='conv5', kernel_initializer='he_normal')(inner)  # (None, 32, 8, 512)
inner = BatchNormalization()(inner)
inner = Activation('relu')(inner)
inner = Conv2D(512, (3, 5), padding='same', name='conv6')(inner)  # (None, 32, 8, 512)
inner = BatchNormalization()(inner)
inner = Activation('relu')(inner)
inner = MaxPooling2D(pool_size=(1, 2), name='max4')(inner)
In [32]:
inner
Out[32]:
<KerasTensor: shape=(None, 150, 62, 512) dtype=float32 (created by layer 'max4')>
In [33]:
inner = Conv2D(512, (3, 5), padding='same', kernel_initializer='he_normal', name='con7')(inner)  # (None, 32, 4, 512)
inner = BatchNormalization()(inner)
inner = Activation('relu')(inner)
In [34]:
inner
Out[34]:
<KerasTensor: shape=(None, 150, 62, 512) dtype=float32 (created by layer 'activation_6')>
In [35]:
inner = Reshape(target_shape=((150, 62*512)), name='reshape')(inner)  # (None, 32, 2048)
inner = Dense(64, activation='relu', kernel_initializer='he_normal', name='dense1')(inner)
In [36]:
inner
Out[36]:
<KerasTensor: shape=(None, 150, 64) dtype=float32 (created by layer 'dense1')>
In [38]:
from keras.layers.merge import add, concatenate
In [39]:
lstm_1 = LSTM(256, return_sequences=True, kernel_initializer='he_normal', name='lstm1')(inner)  # (None, 32, 512)
lstm_1b = LSTM(256, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='lstm1_b')(inner)
reversed_lstm_1b = Lambda(lambda inputTensor: K.reverse(inputTensor, axes=1)) (lstm_1b)
In [40]:
reversed_lstm_1b
Out[40]:
<KerasTensor: shape=(None, 150, 256) dtype=float32 (created by layer 'lambda')>
In [41]:
lstm1_merged = add([lstm_1, reversed_lstm_1b])  # (None, 32, 512)
lstm1_merged = BatchNormalization()(lstm1_merged)
In [42]:
lstm_2 = LSTM(256, return_sequences=True, kernel_initializer='he_normal', name='lstm2')(lstm1_merged)
lstm_2b = LSTM(256, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='lstm_b')(lstm1_merged)
reversed_lstm_2b= Lambda(lambda inputTensor: K.reverse(inputTensor, axes=1)) (lstm_2b)
In [43]:
reversed_lstm_2b
Out[43]:
<KerasTensor: shape=(None, 150, 256) dtype=float32 (created by layer 'lambda_1')>
In [44]:
lstm2_merged = concatenate([lstm_2, reversed_lstm_2b])  # (None, 32, 1024)
lstm2_merged = BatchNormalization()(lstm2_merged)
In [45]:
#Dense(len(char_list)+1, activation = 'softmax')(blstm_2)
inner = Dense(len(char_list)+1, kernel_initializer='he_normal',name='dense2')(lstm2_merged) #(None, 32, 80)
y_pred = Activation('softmax', name='softmax')(inner)
In [46]:
y_pred
Out[46]:
<KerasTensor: shape=(None, 150, 163) dtype=float32 (created by layer 'softmax')>
In [47]:
act_model = Model(input_data, y_pred)
In [48]:
act_model.summary()
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_1 (InputLayer)            [(None, 150, 1000, 1 0                                            
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 150, 1000, 64 1024        input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 150, 1000, 64 256         conv1[0][0]                      
__________________________________________________________________________________________________
activation (Activation)         (None, 150, 1000, 64 0           batch_normalization[0][0]        
__________________________________________________________________________________________________
max1 (MaxPooling2D)             (None, 150, 500, 64) 0           activation[0][0]                 
__________________________________________________________________________________________________
conv2 (Conv2D)                  (None, 150, 500, 128 123008      max1[0][0]                       
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 150, 500, 128 512         conv2[0][0]                      
__________________________________________________________________________________________________
activation_1 (Activation)       (None, 150, 500, 128 0           batch_normalization_1[0][0]      
__________________________________________________________________________________________________
max2 (MaxPooling2D)             (None, 150, 250, 128 0           activation_1[0][0]               
__________________________________________________________________________________________________
conv3 (Conv2D)                  (None, 150, 250, 256 491776      max2[0][0]                       
__________________________________________________________________________________________________
batch_normalization_2 (BatchNor (None, 150, 250, 256 1024        conv3[0][0]                      
__________________________________________________________________________________________________
activation_2 (Activation)       (None, 150, 250, 256 0           batch_normalization_2[0][0]      
__________________________________________________________________________________________________
conv4 (Conv2D)                  (None, 150, 250, 256 983296      activation_2[0][0]               
__________________________________________________________________________________________________
batch_normalization_3 (BatchNor (None, 150, 250, 256 1024        conv4[0][0]                      
__________________________________________________________________________________________________
activation_3 (Activation)       (None, 150, 250, 256 0           batch_normalization_3[0][0]      
__________________________________________________________________________________________________
max3 (MaxPooling2D)             (None, 150, 125, 256 0           activation_3[0][0]               
__________________________________________________________________________________________________
conv5 (Conv2D)                  (None, 150, 125, 512 1966592     max3[0][0]                       
__________________________________________________________________________________________________
batch_normalization_4 (BatchNor (None, 150, 125, 512 2048        conv5[0][0]                      
__________________________________________________________________________________________________
activation_4 (Activation)       (None, 150, 125, 512 0           batch_normalization_4[0][0]      
__________________________________________________________________________________________________
conv6 (Conv2D)                  (None, 150, 125, 512 3932672     activation_4[0][0]               
__________________________________________________________________________________________________
batch_normalization_5 (BatchNor (None, 150, 125, 512 2048        conv6[0][0]                      
__________________________________________________________________________________________________
activation_5 (Activation)       (None, 150, 125, 512 0           batch_normalization_5[0][0]      
__________________________________________________________________________________________________
max4 (MaxPooling2D)             (None, 150, 62, 512) 0           activation_5[0][0]               
__________________________________________________________________________________________________
con7 (Conv2D)                   (None, 150, 62, 512) 3932672     max4[0][0]                       
__________________________________________________________________________________________________
batch_normalization_6 (BatchNor (None, 150, 62, 512) 2048        con7[0][0]                       
__________________________________________________________________________________________________
activation_6 (Activation)       (None, 150, 62, 512) 0           batch_normalization_6[0][0]      
__________________________________________________________________________________________________
reshape (Reshape)               (None, 150, 31744)   0           activation_6[0][0]               
__________________________________________________________________________________________________
dense1 (Dense)                  (None, 150, 64)      2031680     reshape[0][0]                    
__________________________________________________________________________________________________
lstm1_b (LSTM)                  (None, 150, 256)     328704      dense1[0][0]                     
__________________________________________________________________________________________________
lstm1 (LSTM)                    (None, 150, 256)     328704      dense1[0][0]                     
__________________________________________________________________________________________________
lambda (Lambda)                 (None, 150, 256)     0           lstm1_b[0][0]                    
__________________________________________________________________________________________________
add (Add)                       (None, 150, 256)     0           lstm1[0][0]                      
                                                                 lambda[0][0]                     
__________________________________________________________________________________________________
batch_normalization_7 (BatchNor (None, 150, 256)     1024        add[0][0]                        
__________________________________________________________________________________________________
lstm_b (LSTM)                   (None, 150, 256)     525312      batch_normalization_7[0][0]      
__________________________________________________________________________________________________
lstm2 (LSTM)                    (None, 150, 256)     525312      batch_normalization_7[0][0]      
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 150, 256)     0           lstm_b[0][0]                     
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 150, 512)     0           lstm2[0][0]                      
                                                                 lambda_1[0][0]                   
__________________________________________________________________________________________________
batch_normalization_8 (BatchNor (None, 150, 512)     2048        concatenate[0][0]                
__________________________________________________________________________________________________
dense2 (Dense)                  (None, 150, 163)     83619       batch_normalization_8[0][0]      
__________________________________________________________________________________________________
softmax (Activation)            (None, 150, 163)     0           dense2[0][0]                     
==================================================================================================
Total params: 15,266,403
Trainable params: 15,260,387
Non-trainable params: 6,016
__________________________________________________________________________________________________
In [ ]:
 
In [ ]:
 
In [49]:
# # input with shape of height=150 and width=1000
# inputs = Input(shape=(150,1000,1))
 
# # convolution layer with kernel size (3,3)
# conv_1 = Conv2D(64, (3,5), activation = 'relu', padding='same')(inputs)
# # poolig layer with kernel size (2,2)
# pool_1 = MaxPooling2D((2,2), padding='same')(conv_1)
 
# conv_2 = Conv2D(128, (3,5), activation = 'relu', padding='same')(pool_1)
# pool_2 = MaxPooling2D((2,2), padding='same')(conv_2)
 
# conv_3 = Conv2D(256, (3,5), activation = 'relu', padding='same')(pool_2)
 
# conv_4 = Conv2D(256, (3,5), activation = 'relu', padding='same')(conv_3)
# # poolig layer with kernel size (2,1)
# pool_4 = MaxPooling2D((2,2), padding='same')(conv_4)
 
# conv_5 = Conv2D(512, (3,5), activation = 'relu', padding='same')(pool_4)
# # Batch normalization layer
# batch_norm_5 = BatchNormalization()(conv_5)
 
# conv_6 = Conv2D(512, (3,5), activation = 'relu', padding='same')(batch_norm_5)
# batch_norm_6 = BatchNormalization()(conv_6)
# pool_6 = MaxPooling2D((2,2), padding='same')(batch_norm_6)
 
# conv_7 = Conv2D(512, (3,5), activation = 'relu')(pool_6)
 
# squeezed = Flatten()(conv_7)
 
# # bidirectional LSTM layers with units=128
# blstm_1 = LSTM(256, return_sequences=True, dropout = 0.2)(squeezed)
# blstm_2 = LSTM(256, return_sequences=True, dropout = 0.2)(blstm_1)
 
# outputs = Dense(len(char_list)+1, activation = 'softmax')(blstm_2)

# # model to be used at test time
# act_model = Model(inputs, outputs)
In [50]:
the_labels = Input(name='the_labels', shape=[max_label_len], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')

def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred, the_labels, input_length, label_length])

#model to be used at training time
model = Model(inputs=[input_data, the_labels, input_length, label_length], outputs=loss_out)
In [51]:
batch_size = 8
epochs = 5
e = str(epochs)
optimizer_name = 'adam'
In [52]:
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer = optimizer_name, metrics=['accuracy'])

filepath="{}o-{}r-{}e-{}t-{}v.hdf5".format(optimizer_name,
                                          str(RECORDS_COUNT),
                                          str(epochs),
                                          str(train_images.shape[0]),
                                          str(valid_images.shape[0]))

checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
callbacks_list = [checkpoint]
In [ ]:
history = model.fit(x=[train_images, train_padded_label, train_input_length, train_label_length],
                    y=np.zeros(len(train_images)),
                    batch_size=batch_size, 
                    epochs=epochs, 
                    validation_data=([valid_images, valid_padded_label, valid_input_length, valid_label_length], [np.zeros(len(valid_images))]),
                    verbose=1,callbacks=callbacks_list)
Epoch 1/5
1342/1342 [==============================] - 72727s 54s/step - loss: 158.3878 - accuracy: 0.0000e+00 - val_loss: 139.0172 - val_accuracy: 0.0000e+00

Epoch 00001: val_loss improved from inf to 139.01720, saving model to adamo-12686r-5e-10735t-1544v.hdf5
C:\Users\dawaaii\anaconda3\lib\site-packages\tensorflow\python\keras\utils\generic_utils.py:494: CustomMaskWarning: Custom mask layers require a config and must override get_config. When loading, the custom mask layer must be passed to the custom_objects argument.
  warnings.warn('Custom mask layers require a config and must override '
Epoch 2/5
 455/1342 [=========>....................] - ETA: 13:03:29 - loss: 136.0808 - accuracy: 0.0000e+00
In [ ]:
# prediction = act_model.predict(train_images[150:170])
 
# # use CTC decoder
# decoded = K.ctc_decode(prediction,   
#                        input_length=np.ones(prediction.shape[0]) * prediction.shape[1],
#                        greedy=True)[0][0]

# out = K.get_value(decoded)

# # see the results
# for i, x in enumerate(out):
#     print("original_text =  ", train_original_text[150+i])
#     print("predicted text = ", end = '')
#     for p in x:
#         if int(p) != -1:
#             print(char_list[int(p)], end = '')
#     plt.imshow(train_images[150+i].reshape(32,128), cmap=plt.cm.gray)
#     plt.show()
#     print('\n')
In [ ]:
 
In [ ]:
 
In [ ]: